# Author: G. Schumacher
# Project: EUENGAGE
# Last updated: July 2017

# Purpose: Combining Dutch and Danish speeches


# Basic setup -------------------------------------------------------------
# Remove every visible object from the current (top-level) environment so the
# script starts from a clean workspace. Note that this also deletes unrelated
# objects of whoever sources the script in an existing session.
rm(list=ls())
# Packages are attached in this order; later packages mask same-named
# functions of earlier ones, so the order is left untouched.
library(quanteda)  # quanteda_1.4.0
library(car)        # recode() with a string recode spec is used below
library(stringr)    # NOTE(review): no stringr call is visible in this script - confirm it is still needed
library(readtext)   # readtext() reads the speech PDFs
library(dataverse)  # NOTE(review): no dataverse call is visible in this script - confirm it is still needed

# Read in Danish data ---------------------------------------------------
# For this to work download all speeches from the dataverse https://doi.org/10.7910/DVN/V3FHI3 to a folder Danish Speeches.
# The file names are split on "_" into the document variables
# year, party, speaker and number.
temp <- readtext(
  "Danish Speeches/*.pdf",
  docvarsfrom = "filenames",
  dvsep = "_",
  docvarnames = c("year", "party", "speaker", "number")
)

# Connect with functions
# The first column of the CSV holds the file name without extension; add
# ".pdf" and rename it so it can serve as the join key with the speeches.
functies <- read.csv("DN speeches   functions.csv", sep = ";")
functies$File.name <- paste0(functies$File.name, ".pdf")
colnames(functies)[1] <- "doc_id"

# Speeches that have no entry in the functions file (shown when run
# interactively); the merge below keeps matched documents only.
temp$doc_id[!(temp$doc_id %in% functies$doc_id)]
temp <- merge(temp, functies, by = "doc_id")

# Transform to corpus / dfm
dk.text <- text <- corpus(temp)
docvars(dk.text)$country <- "DK"

# Read in Dutch data ---------------------------------------------------
# For this to work download all speeches from the dataverse https://doi.org/10.7910/DVN/V3FHI3 to a folder Dutch Speeches.
# The file names are split on "_" into the document variables
# year, party, speaker and number.
temp <- readtext("Dutch Speeches/*.pdf", docvarsfrom = "filenames", dvsep = "_",
                 docvarnames = c("year", "party", "speaker", "number"))

# some cleaning
# Literal (fixed = TRUE) substitutions applied to the speech text in exactly
# this order: accented letters are mapped to plain letters, and ":", the
# middle dot and the apostrophe are replaced by a space.
# NOTE(review): six of the patterns below are the identical replacement
# character, so after the first one has run the other five can never match.
# The original accented characters appear to have been lost in an encoding
# conversion of this file - restore them from an intact copy of the script.
clean.patterns     <- c("�", "�", "�", ":", "�", "�", "�", "·", "'")
clean.replacements <- c("e", "a", "o", " ", "o", "a", "e", " ", " ")

# gsub() is vectorised over its input, so it is applied to the whole text
# column at once; wrapping it in sapply() only attached every full speech as
# the name of its own element.
temp3 <- temp[, 2]
for (k in seq_along(clean.patterns)) {
  temp3 <- gsub(clean.patterns[k], clean.replacements[k], temp3, fixed = TRUE)
}
temp[, 2] <- temp3

# Connect with functions
# The first column of the CSV holds the file name without extension; add
# ".pdf" and rename it so it can serve as the join key with the speeches.
functies <- read.csv("functies sprekers.csv", sep = ";")
functies$File.name <- paste0(functies$File.name, ".pdf")
colnames(functies)[1] <- "doc_id"

# Speeches that have no entry in the functions file (shown when run
# interactively); the merge below keeps matched documents only.
temp$doc_id[!(temp$doc_id %in% functies$doc_id)]
temp <- merge(temp, functies, by = "doc_id")

# Transform to corpus
text <- corpus(temp)

nl.text <- text
docvars(nl.text)$country <- "NL"

# Combine data ------------------------------------------------------------
# Danish documents come first, followed by the Dutch ones.
corpus <- dk.text + nl.text

# Running number per speech in corpus order; it is used further down to put
# the merged meta data back into this order.
n.speeches <- ndoc(corpus)
docvars(corpus)$speech.no <- seq(from = 1, to = n.speeches, by = 1)

# Combine with IPOD data ------------------------------------------------
# Loads the objects stored in the remote R data file into the workspace;
# the code below expects an object called data_ipod to be among them.
ipod.location <- "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/PE8TWP/OSIZCX"
load(url(ipod.location))

# Create additional variables ---------------------------------------------
# Recode specification: every 'label'=number pair maps one party label (as
# taken from the speech file names) to the numeric id that serves as merge
# key with data_ipod. Labels that are not listed are left unchanged.
party.codes <- "'S'=1629;
                                           'V'=1605;
                                           'SF'=1644;
                                           'EL'=306;
                                           'DF'=1418;
                                           'ARP'=300;
                                           'CPN'=1194;
                                           'PvdA'=742;
                                           'VVD'=1409;
                                           'KVP'=451;
                                           'D66'=345;
                                           'CDA'=235;
                                           'CHU'=405;
                                           'Groenlinks'=756;
                                           'SP'=357;
                                           'PVV'=1501"
party_id <- recode(docvars(corpus)$party, party.codes)
docvars(corpus)$party_id <- party_id
# Keep only the IPOD columns that are used downstream.
# NOTE(review): the columns are picked by position, so a changed layout of
# the IPOD file would silently select different variables - confirm against
# the data version in use.
data_ipod <- data_ipod[,c(1,2,4,5,8,23,25:28,66,67,68,99,319,320,321,323,324,326,328:386)]

# gain.elections: difference in seat.share between a row and the row directly
# above it. It is filled only where the row has an election date and both rows
# belong to the same party; everywhere else it stays NA.
# NOTE(review): this relies on data_ipod being ordered by party and, within
# party, chronologically - confirm.
# Computed for all rows at once. Tables with fewer than two rows are skipped
# (the former 2:n loop ran over c(2, 1) and failed), and rows whose party
# comparison is NA are left as NA instead of aborting the script.
data_ipod$gain.elections <- NA
n.ipod <- nrow(data_ipod)
if (n.ipod > 1) {
  current <- 2:n.ipod
  same.party.election <- !is.na(data_ipod$election_date[current]) &
    data_ipod$party_id[current] == data_ipod$party_id[current - 1]
  current <- current[which(same.party.election)]
  data_ipod$gain.elections[current] <-
    data_ipod$seat.share[current] - data_ipod$seat.share[current - 1]
}

# Attach the IPOD variables to every speech. all.x = TRUE keeps speeches
# without a matching party/year row in data_ipod (their IPOD columns are NA).
data_merged <- merge(
  x = docvars(corpus),
  y = data_ipod,
  by = c("party_id", "year"),
  all.x = TRUE
)

# these were counted as doubles because there were 2 elections in 1953 (Denmark)
# NOTE(review): the rows are dropped by position, which only hits the intended
# rows as long as the input files and the merge order stay exactly the same -
# confirm after any data update.
doubled.rows <- c(930, 932)
data_merged <- data_merged[-doubled.rows, ]

# Extend election data to non-election years. Logic is take value from last election
#
# Looks up one replacement value for a single cell of data_merged.
#
# Arguments:
#   row    - row number in data_merged (one speech).
#   column - column number in data_merged; the variable is located in
#            data_ipod through its column name.
# Value:
#   The most recent non-missing value of that variable among the data_ipod
#   rows of the same party with a year strictly before the speech year, or a
#   zero-length vector when no such value exists or when data_ipod has no
#   column of that name.
#
# Reads the global objects data_ipod and data_merged.
fill.up.function <- function(row, column){
  ipod.column <- match(colnames(data_merged)[column], colnames(data_ipod))
  if (is.na(ipod.column)) {
    return(logical(0))
  }

  before.rows <- which(data_ipod$party_id == data_merged$party_id[row] &
                         data_ipod$year < data_merged$year[row])
  # Put the candidates in chronological order so that "last" means the latest
  # year and not merely the last row of data_ipod; order() keeps rows with the
  # same year in their original sequence.
  before.rows <- before.rows[order(data_ipod$year[before.rows])]

  selection <- data_ipod[[ipod.column]][before.rows]
  tail(selection[!is.na(selection)], 1)
}

# Fill every missing cell in columns 17 to 90 of data_merged with the value
# returned by fill.up.function(); cells for which it returns nothing stay NA.
# seq_len() makes the loop a no-op for an empty table (1:0 would have visited
# rows 1 and 0), and the scratch value no longer overwrites the global `temp`.
# NOTE(review): the column range is hard-coded by position - confirm it still
# covers exactly the IPOD variables after any change to the merge.
for (i in seq_len(nrow(data_merged))) {
  for (j in 17:90) { # was 38?
    if (is.na(data_merged[i, j])) {
      fill.value <- fill.up.function(i, j)
      if (length(fill.value) > 0) {
        data_merged[i, j] <- fill.value
      }
    }
  }
}


# Speaker names that are coded 1 on the women indicator; the comparison
# below is an exact match against the speaker document variable.
women.speakers <- c(
  "Karen Dahlerup", "Ebba Strange", "Eva Hansen", "Sage Hee",
  "Kirsten Jensen", "Birthe Roenn Hornbech", "Britta Schall Holberg",
  "Elsebeth Kock-Petersen", "Jette Gottlieb", "Litten Hansen",
  "Inga Skjaerris Nielsen", "Eva Kjer Hansen",
  "Helge Sander", "Birte Weiss", "Helle Degn", "Joanna Roenn",
  "Gro Harlem Brundtland", "Karen Jespersen", "Pauline Green",
  "Tonna Nilsson", "Pia Kjaersgaard", "Jes Lunde",
  "Ulla Toernaes", "Anne-Marie Meldgaard", "Christel Elgaard",
  "Elise Pedersen", "Helle Thorning-Schmidt", "Inge Nesgaard",
  "Lotte Bundsgaard", "Lene Jensen", "Mette Frederiksen",
  "Mette Svendsen", "Ritt Bjerregaard", "Anette Staehr",
  "Sofie Haestorp Andersen", "Thyra Frank", "Anna Halaand Hansen",
  "Christel Schaldemose", "Kathrine Alexandrowiz",
  "Pernille Blach Hansen", "Dina Staal", "Ayaan Hirsi Ali",
  "Johanne Schmidt-Nielsen", "Stine Brix", "Line Barfod",
  "Bente Kronborg Flensted", "Pernille Skipper", "Klompe",
  "van Someren", "Sint", "Beckers", "van Nieuwenhoven", "Hamer",
  "Ploumen", "Sargentini", "Kant", "Peetoom", "Geel", "Barth", "Sap",
  "Weening", "Graper-van Koolwijk", "Ollongren",
  "Demmers", "In 't Veld2", "Dekker", "van Bijsterveldt", "Bruines",
  "Halsema", "van Dijk", "Brouwer", "Wessel Tuinstra", "Lambers",
  "Cornelissen", "van den Heuvel", "van der Scheer"
)

# 1 when the speaker of the speech is in the list above, 0 otherwise.
is.listed <- data_merged$speaker %in% women.speakers
data_merged$women <- ifelse(is.listed, 1, 0)

# Why are some missing
# Number of missing cells per speech (row of data_merged).
test <- apply(is.na(data_merged), 1, sum)
# Average number of missing cells per party (rows) and year (columns).
summary <- tapply(test, list(data_merged$party, data_merged$year), mean)
# Per party: the years in which a speech has on average more than 20 missing
# cells (shown when run interactively).
apply(summary, 1, function(party.means) which(party.means > 20))
# 1945 = missing; CPN = missing. Years before parties are in parliament are missing.



# Write out ---------------------------------------------------------------
# merge() reordered the rows; speech.no restores the original corpus order so
# that every row lines up with its document again.
speech.order <- order(data_merged$speech.no)
data_merged <- data_merged[speech.order, ]

# Replace the document variables with columns 15 to 91 of the merged table.
# NOTE(review): the range is positional and drops columns 1 to 14 - confirm it
# still selects the intended variables if the inputs change.
docvars(corpus) <- data_merged[, 15:91]
save(corpus, file = "corpus_DK_NL.Rdata")
